#---------------------------------#
# Restrictive Internet Regulation #
# Analysis Script                 #
# last updated: 15.01.2025        #
#---------------------------------#

# Preparation #
#-------------#

# clear workspace
rm(list = ls())

# restart R
.rs.restartR()

# load pacman
if(!require("pacman")) install.packages("pacman")
# load packages 
p_load(tidyverse, lmtest,  MASS, sandwich, jtools, broom, stargazer, corrplot, broom, margins, gridExtra, 
       sf, rnaturalearth, ggpattern, ggeffects, patchwork) 

# set working directory if needed 
# setwd("")
# or open project for directory 

# load data 
df <- read.csv("243884816.R1_df.csv")
# restrict sample to 2001 - 2020 for statistical analysis 
df <- df %>% filter(year > 2000 & year <= 2020)


#### STEP 1 - Summary Stats ---------------------------------------------------------------------------------
## Summary Statistics and intercorrelation check ##

# select vars for analysis 
names(df)

## 1.1 inter correlation matrix of all IVs in main mdel 
independent_vars_df <- df[, c("protests_cgov_lag", "any_conflict_lag", 
                              "v2csreprss_ord_lag", "v2exfxtmhs_3_lag", 
                              "weighted_protests_cgov_950_lag", "weighted_any_conflict_950_lag",
                              "weighted_reg_res_numerical_950_lag", 
                              "total_pop_log", "i_users_lag", 
                              "youth_bulge", "urban_pop")]

# Compute the correlation matrix
cor_matrix <- cor(independent_vars_df, use = "complete.obs")  
# View the correlation matrix
print(cor_matrix)
# visualize:
corrplot(cor_matrix, method = "circle")


## 1.2 Summary Stats Main Model 
df_a <- df %>% 
  dplyr::select(reg_res, protests_cgov_lag, any_conflict_lag, 
                v2csreprss_ord_lag, v2exfxtmhs_3_lag, 
                weighted_protests_cgov_950_lag, weighted_any_conflict_950_lag,
                weighted_reg_res_numerical_950_lag, 
                total_pop_log, i_users_lag, 
                youth_bulge, urban_pop) 

# stargazer Output
stargazer(as.data.frame(df_a), type = "latex", summary.stat = c("mean", "sd", "min", "max", "n"), 
          title = "Summary Statistics", 
          covariate.labels = c("Restrictive Internet Regulation", 
                               "Anti-Government Protests", 
                               "Armed Conflict Internal", 
                               "CSO Repression",
                               "3 Years Post Change HOS Term Limit",
                               "Protest in Region", 
                               "Armed Conflict in Region",
                               "Restrictive I. Regulation in Region",
                               "Population (ln)",
                               "Share of Internet Users",
                               "Youth Bulge",
                               "Share of Urban Population")) 
# discrepancy in N because most datasets do not record data for South Sudan in 2011 


#### STEP 2 - Main Model --------------------------------------------------------------------------------------
# OLS / Linear Probability Model with unit and time fixed effects, 2001-2020 sub-Saharan African countries

### Deterring Internal Threat 
summary(m1_ols <- lm(reg_res ~ protests_cgov_lag + any_conflict_lag + 
               total_pop_log + i_users_lag + youth_bulge + urban_pop + 
               factor(country) + factor(year),
             data = df))

### Expanding Analog Regulation 
summary(m2_ols <- lm(reg_res ~ v2csreprss_ord_lag + v2exfxtmhs_3_lag + 
               total_pop_log + i_users_lag + youth_bulge + urban_pop + 
               factor(country) + factor(year),
             data = df)) 

### Fending Off Threats Abroad 
summary(m3_ols <- lm(reg_res ~ weighted_protests_cgov_950_lag + weighted_any_conflict_950_lag + 
               total_pop_log + i_users_lag + youth_bulge + urban_pop + 
               factor(country) + factor(year),
             data = df)) 

### Learning from Neighbors
summary(m4_ols <- lm(reg_res ~ weighted_reg_res_numerical_950_lag + 
               total_pop_log + i_users_lag + youth_bulge + urban_pop + 
               factor(country) + factor(year),
             data = df)) 

### All together 
summary(m5_ols <- lm(reg_res ~ protests_cgov_lag + any_conflict_lag +
               v2csreprss_ord_lag + v2exfxtmhs_3_lag + 
               weighted_protests_cgov_950_lag + weighted_any_conflict_950_lag +  
               weighted_reg_res_numerical_950_lag + 
               total_pop_log + i_users_lag + youth_bulge + urban_pop + 
               factor(country) + factor(year),
             data = df))

### export with stargazer 
stargazer(m1_ols, m2_ols, m3_ols, m4_ols, m5_ols, type = "latex",
          omit = c("country", "year"),  
          covariate.labels = c(
                               "Anti-Government Protests", 
                               "Armed Conflict Internal", 
                               "CSO Repression",
                               "3 Years Post Change HOS Term Limit",
                               "Protest in Region", 
                               "Armed Conflict in Region",
                               "Restrictive I. Regulation in Region",
                               "Population (ln)",
                               "Share of Internet Users",
                               "Youth Bulge",
                               "Share of Urban Population"
          ),
          dep.var.labels = c("Potentially Restrictive Internet Legislation")
)


#### STEP 3 - Visualize Results -----------------------------------------------------------------------------

### 3.1 Marginal Effects of all variables in one plot 

margins_m1 <- margins(m1_ols, variables = c("protests_cgov_lag", "any_conflict_lag"))
margins_m2 <- margins(m2_ols, variables = c("v2csreprss_ord_lag", "v2exfxtmhs_3_lag"))
margins_m3 <- margins(m3_ols, variables = c("weighted_protests_cgov_950_lag", "weighted_any_conflict_950_lag"))
margins_m4 <- margins(m4_ols, variables = "weighted_reg_res_numerical_950_lag")

# Summarize the marginal effects to get average marginal effects
summary_m1 <- summary(margins_m1)
summary_m2 <- summary(margins_m2)
summary_m3 <- summary(margins_m3)
summary_m4 <- summary(margins_m4)

# Convert summaries to data frames
effects_m1 <- as.data.frame(summary_m1)
effects_m2 <- as.data.frame(summary_m2)
effects_m3 <- as.data.frame(summary_m3)
effects_m4 <- as.data.frame(summary_m4)

# Filter only the variables of interest
effects_m1 <- effects_m1 %>% filter(factor %in% c("protests_cgov_lag", "any_conflict_lag")) %>% mutate(model = "Model 1")
effects_m2 <- effects_m2 %>% filter(factor %in% c("v2csreprss_ord_lag", "v2exfxtmhs_3_lag")) %>% mutate(model = "Model 2")
effects_m3 <- effects_m3 %>% filter(factor %in% c("weighted_protests_cgov_950_lag", "weighted_any_conflict_950_lag")) %>% mutate(model = "Model 3")
effects_m4 <- effects_m4 %>% filter(factor == "weighted_reg_res_numerical_950_lag") %>% mutate(model = "Model 4")

# Combine all marginal effects into a single data frame
combined_margins_df <- bind_rows(effects_m1, effects_m2, effects_m3, effects_m4)

combined_margins_df$factor <- factor(combined_margins_df$factor, levels = 
                                       rev(c("protests_cgov_lag", "any_conflict_lag", 
                                             "v2csreprss_ord_lag", "v2exfxtmhs_3_lag", 
                                             "weighted_protests_cgov_950_lag", "weighted_any_conflict_950_lag", 
                                             "weighted_reg_res_numerical_950_lag"
)),
labels = rev(c( "Anti-Government Protests", 
                "Armed Conflict Internal", 
                "CSO Repression",
                "3 Years Post Change HOS Term Limit",
                "Protest in Region", 
                "Armed Conflict in Region",
                "Restrictive I. Regulation in Region"
)))

ggplot(combined_margins_df, aes(x = factor, y = AME, ymin = lower, ymax = upper, color = model)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", size = 0.5) +
  geom_point(position = position_dodge(width = 0.25), size = 3, color = "black") +
  geom_errorbar(position = position_dodge(width = 0.25), width = 0.05, color = "black") +
  coord_flip() + 
  labs(y = "Marginal effect on likelihood of restrictive regulation", x = "", title = "") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill = "white", colour = "black", size = 0.5),
        axis.title = element_text(size = 18, family = "Times New Roman"),
        axis.text.x = element_text(hjust = 1, size = 18, family = "Times New Roman"),
        axis.text.y = element_text(size = 18, family = "Times New Roman"),
        legend.position = "none") +
  scale_y_continuous(
    limits = c(-0.525, 0.525), 
    breaks = c(-0.5, -0.25, 0, 0.25, 0.5)
  )

# Save the plot
# ggsave("marginal-effects_all_m5.png", plot = last_plot(), width = 10, height = 7, units = "in", dpi = 300)


### 3.2 predicted probabilities protests_cgov_lag and any_conflict_lag (model 1)

# Filter dataset to only include rows with data until 2018 (due to protests_cgov_lag)
df_filtered <- df %>%
  filter(year <= 2018 & !is.na(protests_cgov_lag))

# Convert year and country to factors
df_filtered$year <- as.factor(df_filtered$year)
df_filtered$country <- as.factor(df_filtered$country)

# estimate new model 
summary(m1_ols <- lm(reg_res ~ protests_cgov_lag + any_conflict_lag +
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = df_filtered))

## protest 
# Create a new dataset for predictions
new_data_protests <- expand.grid(
  protests_cgov_lag = seq(min(df_filtered$protests_cgov_lag, na.rm = TRUE),
                          max(df_filtered$protests_cgov_lag, na.rm = TRUE), length.out = 100),
  any_conflict_lag = mean(df_filtered$any_conflict_lag, na.rm = TRUE),
  total_pop_log = mean(df_filtered$total_pop_log, na.rm = TRUE),
  i_users_lag = mean(df_filtered$i_users_lag, na.rm = TRUE),
  youth_bulge = mean(df_filtered$youth_bulge, na.rm = TRUE),
  urban_pop = mean(df_filtered$urban_pop, na.rm = TRUE),
  country = levels(m1_ols$model$`factor(country)`),  
  year = levels(m1_ols$model$`factor(year)`)    
)

# Generate predictions
pred_protests <- predict(m1_ols, newdata = new_data_protests, se.fit = TRUE)

# Combine predictions with the input data
pred_protests_df <- data.frame(
  protests_cgov_lag = new_data_protests$protests_cgov_lag,
  country = new_data_protests$country,
  year = new_data_protests$year,
  predicted = pred_protests$fit,
  lower = pred_protests$fit - 1.96 * pred_protests$se.fit,
  upper = pred_protests$fit + 1.96 * pred_protests$se.fit
)

# Aggregate predictions across all countries and years
pred_protests_summary <- pred_protests_df %>%
  group_by(protests_cgov_lag) %>%
  summarise(
    mean_predicted = mean(predicted, na.rm = TRUE),
    lower = mean(lower, na.rm = TRUE),
    upper = mean(upper, na.rm = TRUE)
  )

# Visualize the averaged predictions
p_protests <- ggplot(pred_protests_summary, aes(x = protests_cgov_lag, y = mean_predicted)) +
  geom_line(linewidth = 1, color = "black") +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "grey") +
  labs(
    x = "Protests Against Government",
    y = "Predicted Probability of Restrictive Regulation",
    title = ""
  ) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", colour = "black", linewidth = 0.5),
    axis.title = element_text(size = 18, family = "Times New Roman"),
    axis.text.x = element_text(hjust = 1, size = 18, family = "Times New Roman"),
    axis.text.y = element_text(size = 18, family = "Times New Roman"),
    legend.position = "none"
  )

## conflict 
new_data_conflict <- expand.grid(
  protests_cgov_lag = mean(df_filtered$protests_cgov_lag, na.rm = TRUE),
  any_conflict_lag = c(0, 1),
  total_pop_log = mean(df_filtered$total_pop_log, na.rm = TRUE),
  i_users_lag = mean(df_filtered$i_users_lag, na.rm = TRUE),
  youth_bulge = mean(df_filtered$youth_bulge, na.rm = TRUE),
  urban_pop = mean(df_filtered$urban_pop, na.rm = TRUE),
  country = levels(df_filtered$country),  # Include all countries
  year = levels(df_filtered$year)         # Include all years
)

# Generate predictions using the model
pred_conflict <- predict(m1_ols, newdata = new_data_conflict, se.fit = TRUE)

# Combine predictions with the input data
pred_conflict_df <- data.frame(
  any_conflict_lag = new_data_conflict$any_conflict_lag,
  country = new_data_conflict$country,
  year = new_data_conflict$year,
  predicted = pred_conflict$fit,
  lower = pred_conflict$fit - 1.96 * pred_conflict$se.fit,
  upper = pred_conflict$fit + 1.96 * pred_conflict$se.fit
)

# Aggregate predictions to focus on `any_conflict_lag`
pred_conflict_summary <- pred_conflict_df %>%
  group_by(any_conflict_lag) %>%
  summarise(
    mean_predicted = mean(predicted),
    lower = mean(lower),
    upper = mean(upper)
  )

p_conflict <- ggplot(pred_conflict_summary, aes(x = factor(any_conflict_lag), y = mean_predicted)) +
  geom_bar(stat = "identity", fill = "grey", color = "black", width = 0.6) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2, color = "black") +
  labs(
    x = "0 = No Conflict, 1 = Conflict",
    y = "",
    title = ""
  ) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", colour = "black", size = 0.5),
    axis.title = element_text(size = 18, family = "Times New Roman"),
    axis.text.x = element_text(hjust = 1, size = 18, family = "Times New Roman"),
    axis.text.y = element_text(size = 18, family = "Times New Roman"),
    legend.position = "none"
  )


# Combine plots side by side
combined_plot <- p_protests + p_conflict + plot_annotation(
  title = ""
)

# Save the plot
ggsave("predicted_prob_dettering.png", plot = combined_plot, width = 10, height = 6, dpi = 300)


### 3.3 predicted probabilities v2csreprss_ord_lag, v2exfxtmhs_3_lag, v2x_regime_bin_ea_lag (model 2)

## CSO repression 
# CSO Repression Predictions
new_data_repression <- expand.grid(
  v2csreprss_ord_lag = seq(0, 4, length.out = 100),
  v2exfxtmhs_3_lag = mean(df_filtered$v2exfxtmhs_3_lag, na.rm = TRUE),
  total_pop_log = mean(df_filtered$total_pop_log, na.rm = TRUE),
  i_users_lag = mean(df_filtered$i_users_lag, na.rm = TRUE),
  youth_bulge = mean(df_filtered$youth_bulge, na.rm = TRUE),
  urban_pop = mean(df_filtered$urban_pop, na.rm = TRUE),
  country = levels(m2_ols$model$`factor(country)`),  
  year = levels(m2_ols$model$`factor(year)`)
)

pred_repression <- predict(m2_ols, newdata = new_data_repression, se.fit = TRUE)

pred_repression_df <- data.frame(
  v2csreprss_ord_lag = new_data_repression$v2csreprss_ord_lag,
  country = new_data_repression$country,
  year = new_data_repression$year,
  predicted = pred_repression$fit,
  lower = pred_repression$fit - 1.96 * pred_repression$se.fit,
  upper = pred_repression$fit + 1.96 * pred_repression$se.fit
)

# Summarize predictions
pred_repression_summary <- pred_repression_df %>%
  group_by(v2csreprss_ord_lag) %>%
  summarise(
    mean_predicted = mean(predicted),
    lower = mean(lower),
    upper = mean(upper)
  )

p_repression_line <- ggplot(pred_repression_summary, aes(x = v2csreprss_ord_lag, y = mean_predicted)) +
  geom_line(size = 1, color = "black") +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "grey") +
  labs(x = "CSO Repression (0–4)", y = "Predicted Probability of Restrictive Regulation", title = "") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", colour = "black", size = 0.5),
    axis.title = element_text(size = 18, family = "Times New Roman"),
    axis.text.x = element_text(hjust = 1, size = 18, family = "Times New Roman"),
    axis.text.y = element_text(size = 18, family = "Times New Roman"),
    legend.position = "none"
  )

## Changes in term limit HOS
# Presidential Term Limit Predictions
new_data_term_limit <- expand.grid(
  v2exfxtmhs_3_lag = c(0, 1),
  v2csreprss_ord_lag = mean(df_filtered$v2csreprss_ord_lag, na.rm = TRUE),
  total_pop_log = mean(df_filtered$total_pop_log, na.rm = TRUE),
  i_users_lag = mean(df_filtered$i_users_lag, na.rm = TRUE),
  youth_bulge = mean(df_filtered$youth_bulge, na.rm = TRUE),
  urban_pop = mean(df_filtered$urban_pop, na.rm = TRUE),
  country = levels(m2_ols$model$`factor(country)`),  
  year = levels(m2_ols$model$`factor(year)`)
)

pred_term_limit <- predict(m2_ols, newdata = new_data_term_limit, se.fit = TRUE)

pred_term_limit_df <- data.frame(
  v2exfxtmhs_3_lag = new_data_term_limit$v2exfxtmhs_3_lag,
  country = new_data_term_limit$country,
  year = new_data_term_limit$year,
  predicted = pred_term_limit$fit,
  lower = pred_term_limit$fit - 1.96 * pred_term_limit$se.fit,
  upper = pred_term_limit$fit + 1.96 * pred_term_limit$se.fit
)

# Summarize predictions
pred_term_limit_summary <- pred_term_limit_df %>%
  group_by(v2exfxtmhs_3_lag) %>%
  summarise(
    mean_predicted = mean(predicted),
    lower = mean(lower),
    upper = mean(upper)
  )

p_term_limit <- ggplot(pred_term_limit_summary, aes(x = factor(v2exfxtmhs_3_lag), y = mean_predicted)) +
  geom_bar(stat = "identity", fill = "grey", color = "black", width = 0.6) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2, color = "black") +
  labs(x = "3 Years Post Change to HOS Term Limit", y = "", title = "") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", colour = "black", size = 0.5),
    axis.title = element_text(size = 18, family = "Times New Roman"),
    axis.text.x = element_text(hjust = 1, size = 18, family = "Times New Roman"),
    axis.text.y = element_text(size = 18, family = "Times New Roman"),
    legend.position = "none"
  )

# Combine Plots
combined_cso_term_autocracy <- p_repression_line + p_term_limit + plot_annotation(
  title = ""
)

# Save the Plot
ggsave("predicted_prob_updating.png", plot = combined_cso_term_autocracy, width = 10, height = 6, dpi = 300)


### 3.4 marginal effects reiterated across distances for weighted_protests_cgov_950_lag weighted_any_conflict_950_lag (model 3)

## SCAD at distances with fixed effects 
# Define the distance thresholds
distance_thresholds <- c(50, 550, 950, 1450, 1950, 2350)

# Initialize a list to store results
results <- list()

# Loop through each distance threshold
for (distance in distance_thresholds) {
  # Define the variable name for the lagged distance measure
  var_name <- paste0("weighted_protests_cgov_", distance, "_lag")
  
  # Fit the OLS model with country and year fixed effects
  model <- lm(
    as.formula(paste0("reg_res ~ ", var_name, "+ weighted_any_conflict_950_lag +  
               + total_pop_log + i_users_lag + youth_bulge + urban_pop + 
               factor(country) + factor(year)")),
    data = df
  )
  
  # Save the model in the list
  results[[as.character(distance)]] <- model
}

# Extract coefficients and standard errors for each model
coefficients <- sapply(distance_thresholds, function(distance) {
  model <- results[[as.character(distance)]]
  summary_model <- summary(model)
  
  # Find the row for the relevant variable
  coef_index <- grep(paste0("weighted_protests_cgov_", distance, "_lag"), rownames(summary_model$coefficients))
  
  # Extract estimate and standard error
  c(Estimate = summary_model$coefficients[coef_index, "Estimate"],
    Std_Error = summary_model$coefficients[coef_index, "Std. Error"])
})

# Convert to a data frame
coeff_df <- data.frame(
  Distance = distance_thresholds,
  Estimate = coefficients["Estimate", ],
  Std_Error = coefficients["Std_Error", ]
)

# Plot
ggplot(coeff_df, aes(x = Distance, y = Estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = Estimate - 1.96 * Std_Error, ymax = Estimate + 1.96 * Std_Error)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    x = "Distance Threshold (km)",
    y = "Coefficient Estimate"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom",
        text = element_text(size = 18, family = "Times"),  # Set overall text size to 14 and font to Times Roman
        axis.text.x = element_text(size = 16, family = "Times"),  # Make axis numbers bigger and set font
        axis.text.y = element_text(size = 16, family = "Times"),
        legend.text = element_text(size = 16, family = "Times"),
        legend.title = element_blank())


### UCDP at distances with fixed effects 

# Initialize a list to store results
results <- list()

# Loop through each distance threshold
for (distance in distance_thresholds) {
  # Define the variable name for the lagged distance measure
  var_name <- paste0("weighted_any_conflict_", distance, "_lag")
  
  # Fit the OLS model with country and year fixed effects
  model <- lm(
    as.formula(paste0("reg_res ~ ", var_name, "+ weighted_protests_cgov_950_lag +  
                        + total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                        factor(country) + factor(year)")),
    data = df
  )
  
  # Save the model in the list
  results[[as.character(distance)]] <- model
}

# Extract coefficients and standard errors for each model
coefficients <- sapply(distance_thresholds, function(distance) {
  model <- results[[as.character(distance)]]
  summary_model <- summary(model)
  
  # Find the row for the relevant variable
  coef_index <- grep(paste0("weighted_any_conflict_", distance, "_lag"), rownames(summary_model$coefficients))
  
  # Extract estimate and standard error
  c(Estimate = summary_model$coefficients[coef_index, "Estimate"],
    Std_Error = summary_model$coefficients[coef_index, "Std. Error"])
})

# Convert to a data frame
coeff_df <- data.frame(
  Distance = distance_thresholds,
  Estimate = coefficients["Estimate", ],
  Std_Error = coefficients["Std_Error", ]
)

# Plot
ggplot(coeff_df, aes(x = Distance, y = Estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = Estimate - 1.96 * Std_Error, ymax = Estimate + 1.96 * Std_Error)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    x = "Distance Threshold (km)",
    y = "Coefficient Estimate"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom",
        text = element_text(size = 18, family = "Times"),  # Set overall text size to 14 and font to Times Roman
        axis.text.x = element_text(size = 16, family = "Times"),  # Make axis numbers bigger and set font
        axis.text.y = element_text(size = 16, family = "Times"),
        legend.text = element_text(size = 16, family = "Times"),
        legend.title = element_blank())


### 3.5 Learning: marginal effects reiterated across distances (model 4)

# Initialize a list to store results
results <- list()

# Loop through each distance threshold
for (distance in distance_thresholds) {
  # Define the variable name for the lagged distance measure
  var_name <- paste0("weighted_reg_res_numerical_", distance, "_lag")
  
  # Fit the OLS model with country and year fixed effects
  model <- lm(
    as.formula(paste0("reg_res ~ ", var_name, " + 
                        + total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                        factor(country) + factor(year)")),
    data = df
  )
  
  # Save the model in the list
  results[[as.character(distance)]] <- model
}

# Extract coefficients and standard errors for each model
coefficients <- sapply(distance_thresholds, function(distance) {
  model <- results[[as.character(distance)]]
  summary_model <- summary(model)
  
  # Find the row for the relevant variable
  coef_index <- grep(paste0("weighted_reg_res_numerical_", distance, "_lag"), rownames(summary_model$coefficients))
  
  # Extract estimate and standard error
  c(Estimate = summary_model$coefficients[coef_index, "Estimate"],
    Std_Error = summary_model$coefficients[coef_index, "Std. Error"])
})

# Convert to a data frame
coeff_df <- data.frame(
  Distance = distance_thresholds,
  Estimate = coefficients["Estimate", ],
  Std_Error = coefficients["Std_Error", ]
)

# Plot
ggplot(coeff_df, aes(x = Distance, y = Estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = Estimate - 1.96 * Std_Error, ymax = Estimate + 1.96 * Std_Error)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    x = "Distance Threshold (km)",
    y = "Coefficient Estimate"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom",
        text = element_text(size = 18, family = "Times"),  # Set overall text size to 14 and font to Times Roman
        axis.text.x = element_text(size = 16, family = "Times"),  # Make axis numbers bigger and set font
        axis.text.y = element_text(size = 16, family = "Times"),
        legend.text = element_text(size = 16, family = "Times"),
        legend.title = element_blank())

## 90% CI 
# Extract coefficients and standard errors
coeff_df_90 <- data.frame(
  Distance = distance_thresholds,
  Estimate = coefficients["Estimate", ],
  Std_Error = coefficients["Std_Error", ],
  CI_Lower = coefficients["Estimate", ] - 1.645 * coefficients["Std_Error", ],
  CI_Upper = coefficients["Estimate", ] + 1.645 * coefficients["Std_Error", ]
)

# Plot with 90% confidence intervals
ggplot(coeff_df_90, aes(x = Distance, y = Estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = CI_Lower, ymax = CI_Upper)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    x = "Distance Threshold (km)",
    y = "Coefficient Estimate"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom",
        text = element_text(size = 18, family = "Times"),  # Set overall text size to 14 and font to Times Roman
        axis.text.x = element_text(size = 16, family = "Times"),  # Make axis numbers bigger and set font
        axis.text.y = element_text(size = 16, family = "Times"),
        legend.text = element_text(size = 16, family = "Times"),
        legend.title = element_blank())


#### STEP 4 - Robustness Checks - Different Models ----------------------------------------------------------

### 4.1 Binary Logit Clustered SEs 
# Controls: v2x_regime + total_pop_log + i_users_lag + youth_bulge + urban_pop

## Deterring  
summary(m1_cl <- glm(reg_res ~ protests_cgov_lag + any_conflict_lag +  
                       + total_pop_log + i_users_lag + youth_bulge + urban_pop, 
                     data = df, family = binomial(link = "logit")))
coeftest(m1_cl, vcov. = vcovCL(m1_cl, cluster = df$country, type = "HC0"))

## Expanding   
summary(m2_cl <- glm(reg_res ~  v2csreprss_ord_lag + v2exfxtmhs_3_lag + 
                       + total_pop_log + i_users_lag + youth_bulge + urban_pop,
                     data = df, family = binomial(link = "logit")))
coeftest(m2_cl, vcov. = vcovCL(m2_cl, cluster = df$country, type = "HC0"))

## Fending Off 
summary(m3_cl <- glm(reg_res ~ weighted_protests_cgov_950_lag + weighted_any_conflict_950_lag +  
                       total_pop_log + i_users_lag + youth_bulge + urban_pop,
                     data = df, family = binomial(link = "logit")))
coeftest(m3_cl, vcov. = vcovCL(m3_cl, cluster = df$country, type = "HC0"))

## Learning 
summary(m4_cl <- glm(reg_res ~ weighted_reg_res_numerical_950_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop,
                     data = df, family = binomial(link = "logit")))
coeftest(m4_cl, vcov. = vcovCL(m4_cl, cluster = df$country, type = "HC0"))

## All together 
summary(m5_cl <- glm(reg_res ~ protests_cgov_lag + any_conflict_lag +
                       v2csreprss_ord_lag + v2exfxtmhs_3_lag + 
                       weighted_protests_cgov_950_lag + weighted_any_conflict_950_lag +  
                       weighted_reg_res_numerical_950_lag +  
                       total_pop_log + i_users_lag + youth_bulge + urban_pop,
                     data = df, family = binomial(link = "logit")))
coeftest(m5_cl, vcov. = vcovCL(m5_cl, cluster = df$country, type = "HC0"))

## export stargazer 
se_m1_cl <- sqrt(diag(vcovCL(m1_cl, cluster = df$country, type = "HC0")))
se_m2_cl <- sqrt(diag(vcovCL(m2_cl, cluster = df$country, type = "HC0")))
se_m3_cl <- sqrt(diag(vcovCL(m3_cl, cluster = df$country, type = "HC0")))
se_m4_cl <- sqrt(diag(vcovCL(m4_cl, cluster = df$country, type = "HC0")))
se_m5_cl <- sqrt(diag(vcovCL(m5_cl, cluster = df$country, type = "HC0")))

stargazer(m1_cl, m2_cl, m3_cl, m4_cl, m5_cl, type = "latex",
          se = list(se_m1_cl, se_m2_cl, se_m3_cl, se_m4_cl, se_m5_cl),
          omit = "country", 
          covariate.labels = c(
                               "Anti-Government Protests", 
                               "Armed Conflict Internal", 
                               "CSO Repression",
                               "3 Years Post Change HOS Term Limit",
                               "Protest in Region", 
                               "Armed Conflict in Region",
                               "Restrictive I. Regulation in Region",
                               "Population (ln)",
                               "Share of Internet Users",
                               "Youth Bulge",
                               "Share of Urban Population"
          ),
          dep.var.labels = c("Potentially Restrictive Internet Legislation")
)


### 4.2 OLS with FE simulating logit FE 
## to simulate logit bin with FE, OLS with FE excluding countries which never enacted restrictive regulation 
# see https://arxiv.org/abs/1810.12105 

## Filter out countries where reg_res is never 1
filtered_data <- df %>%
  group_by(country) %>%       # Group by country
  filter(any(reg_res == 1)) %>% # Keep only groups where reg_res has at least one 1
  ungroup()                   # Remove grouping
df %>% group_by(country) %>% filter(!any(reg_res == 1)) %>% view()
# removes Equatorial Guinea and Guinea-Bissau

## Deterring Internal Threat 
summary(m1_ols <- lm(reg_res ~ protests_cgov_lag + any_conflict_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = filtered_data))

## Expanding Analog Regulation 
summary(m2_ols <- lm(reg_res ~ v2csreprss_ord_lag + v2exfxtmhs_3_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = filtered_data)) 

## Fending Off Threats Abroad 
summary(m3_ols <- lm(reg_res ~ weighted_protests_cgov_950_lag + weighted_any_conflict_950_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = filtered_data)) 

## Learning from Neighbors
summary(m4_ols <- lm(reg_res ~ weighted_reg_res_numerical_950_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = filtered_data)) 

## All together 
summary(m5_ols <- lm(reg_res ~ protests_cgov_lag + any_conflict_lag +
                       v2csreprss_ord_lag + v2exfxtmhs_3_lag + 
                       weighted_protests_cgov_950_lag + weighted_any_conflict_950_lag +  
                       weighted_reg_res_numerical_950_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = filtered_data))

## Export 
stargazer(m1_ols, m2_ols, m3_ols, m4_ols, m5_ols, type = "latex",
          omit = c("country", "year"),  
          covariate.labels = c(
            "Anti-Government Protests", 
            "Armed Conflict Internal", 
            "CSO Repression",
            "3 Years Post Change HOS Term Limit",
            "Protest in Region", 
            "Armed Conflict in Region",
            "Restrictive I. Regulation in Region",
            "Population (ln)",
            "Share of Internet Users",
            "Youth Bulge",
            "Share of Urban Population"
          ),
          dep.var.labels = c("Potentially Restrictive Internet Legislation")
)


### 4.3 OLS, autocracies only 
# create filtered df 
countries_to_exclude <- df %>%
  filter(year == 2001 & v2x_regime_lag == 0) %>%
  distinct(country) 
filtered_data <- df %>%
  anti_join(countries_to_exclude, by = "country")

## Deterring Internal Threat 
summary(m1_ols <- lm(reg_res ~ protests_cgov_lag + any_conflict_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = filtered_data))

## Expanding Analog Regulation 
summary(m2_ols <- lm(reg_res ~ v2csreprss_ord_lag + v2exfxtmhs_3_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = filtered_data)) 

## Fending Off Threats Abroad 
summary(m3_ols <- lm(reg_res ~ weighted_protests_cgov_950_lag + weighted_any_conflict_950_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = filtered_data)) 

## Learning from Neighbors
summary(m4_ols <- lm(reg_res ~ weighted_reg_res_numerical_950_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = filtered_data)) 

## All together 
summary(m5_ols <- lm(reg_res ~ protests_cgov_lag + any_conflict_lag +
                       v2csreprss_ord_lag + v2exfxtmhs_3_lag + 
                       weighted_protests_cgov_950_lag + weighted_any_conflict_950_lag +  
                       weighted_reg_res_numerical_950_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = filtered_data))

## Export 
stargazer(m1_ols, m2_ols, m3_ols, m4_ols, m5_ols, type = "latex",
          omit = c("country", "year"),  
          covariate.labels = c(
            "Anti-Government Protests", 
            "Armed Conflict Internal", 
            "CSO Repression",
            "3 Years Post Change HOS Term Limit",
            "Protest in Region", 
            "Armed Conflict in Region",
            "Restrictive I. Regulation in Region",
            "Population (ln)",
            "Share of Internet Users",
            "Youth Bulge",
            "Share of Urban Population"
          ),
          dep.var.labels = c("Potentially Restrictive Internet Legislation")
)


#### 4.5 Descriptive Analysis 

# Define binary and numerical variables
binary_vars <- c("any_conflict_lag", "v2exfxtmhs_3_lag")
numerical_vars <- c("protests_cgov_lag", "v2csreprss_ord_lag", 
                    "weighted_protests_cgov_950_lag", "weighted_any_conflict_950_lag",
                    "weighted_reg_res_numerical_950_lag")

# Calculate mean thresholds for numerical variables
mean_thresholds <- df %>%
  summarise(across(all_of(numerical_vars), ~ mean(.x, na.rm = TRUE)))

# Perform binary and numerical analyses, calculating the difference
binary_analysis_diff <- df %>%
  group_by(reg_res) %>%
  summarise(across(all_of(binary_vars), ~ mean(. == 1, na.rm = TRUE) * 100)) %>%
  pivot_longer(cols = -reg_res, names_to = "Variable", values_to = "Probability (%)") %>%
  pivot_wider(names_from = reg_res, values_from = `Probability (%)`) %>%
  mutate(Difference = `1` - `0`) %>%
  dplyr::select(Variable, Difference)

numerical_analysis_diff <- df %>%
  group_by(reg_res) %>%
  summarise(across(all_of(numerical_vars), 
                   ~ mean(.x > mean_thresholds[[cur_column()]], na.rm = TRUE) * 100)) %>%
  pivot_longer(cols = -reg_res, names_to = "Variable", values_to = "Probability (%)") %>%
  pivot_wider(names_from = reg_res, values_from = `Probability (%)`) %>%
  mutate(Difference = `1` - `0`) %>%
  dplyr::select(Variable, Difference)

# Combine and label results for differences
variable_labels <- c(
  "protests_cgov_lag" = "Anti-Government Protests",
  "any_conflict_lag" = "Armed Conflict Internal",
  "v2csreprss_ord_lag" = "CSO Repression",
  "v2exfxtmhs_3_lag" = "3 Years Post Change HOS Term Limit",
  "weighted_protests_cgov_950_lag" = "Protest in Region",
  "weighted_any_conflict_950_lag" = "Armed Conflict in Region",
  "weighted_reg_res_numerical_950_lag" = "Restrictive I. Regulation in Region"
)

desired_order <- names(variable_labels)

combined_results_diff <- bind_rows(binary_analysis_diff, numerical_analysis_diff) %>%
  mutate(
    Variable = recode(Variable, !!!variable_labels), 
    Variable = factor(Variable, levels = variable_labels) 
  ) %>%
  arrange(Variable) %>%
  dplyr::select(Variable, Difference)

# Prepare data for stargazer
colnames(combined_results_diff) <- c("Independent Variables", "Difference in Sahre t-1 (%)")
stargazer_matrix <- as.matrix(combined_results_diff)

# Generate LaTeX table with stargazer
stargazer(stargazer_matrix, 
          summary = FALSE, 
          rownames = FALSE, 
          title = "Descriptive Statistics: Differences in Probabilities Between Laws Enacted and Not Enacted",
          label = "tab:descriptive_stats_diff",
          style = "default",
          header = FALSE)


### 4.5 checking descriptives for CSO repression and HOS term limit changes by countries 

## CSO repression 
df %>% 
  dplyr::select(country, year, v2csreprss_ord_lag) %>% 
  filter(v2csreprss_ord_lag >= 3) %>% 
  view() 

## changes in HOS term limit 
df %>% dplyr::select(country, year, v2exfxtmhs_3_lag, v2exfxtmhs_1_lag) %>% view() 

# Calculate total changes in 'v2exfxtmhs_3_lag' for each country
country_totals <- df %>%
  group_by(country) %>%
  summarise(total_changes = sum(v2exfxtmhs_1_lag, na.rm = TRUE), .groups = "drop")

# Calculate the overall mean of total changes across countries
mean_total_changes <- country_totals %>%
  summarise(mean_changes = mean(total_changes, na.rm = TRUE)) %>%
  pull(mean_changes)

# Identify countries with total changes above the mean
countries_above_mean <- country_totals %>%
  filter(total_changes > mean_total_changes)

# Display the results
countries_above_mean



#### STEP 5 Robustness Checks - Different Measures ---------------------------------------------------------

### 5.1 same IVs new controls 

## total_pop_log + english_main + gdp_pcap_log + mphone_users_s_lag

## Main model 
names(df)

### Deterring Internal Threat 
summary(m1_ols <- lm(reg_res ~ protests_cgov_lag + any_conflict_lag + 
                       total_pop_log + english_main + gdp_pcap_log + mphone_users_s_lag +
                       factor(country) + factor(year),
                     data = df))

### Updating
summary(m2_ols <- lm(reg_res ~ v2csreprss_ord_lag + v2exfxtmhs_3_lag +
                       total_pop_log + english_main + gdp_pcap_log + mphone_users_s_lag +
                       factor(country) + factor(year),
                     data = df)) 

### Deterring Threats Abroad 
summary(m3_ols <- lm(reg_res ~ weighted_protests_cgov_950_lag + weighted_any_conflict_950_lag +
                       total_pop_log + english_main + gdp_pcap_log + mphone_users_s_lag +
                       factor(country) + factor(year),
                     data = df)) 

### Learning 
summary(m4_ols <- lm(reg_res ~ weighted_reg_res_numerical_950_lag + 
                       total_pop_log + english_main + gdp_pcap_log + mphone_users_s_lag +
                       factor(country) + factor(year),
                     data = df)) 

### All together 
summary(m5_ols <- lm(reg_res ~ protests_cgov_lag + any_conflict_lag +
                       v2csreprss_ord_lag + v2exfxtmhs_3_lag + 
                       weighted_protests_cgov_950_lag + weighted_any_conflict_950_lag +  
                       weighted_reg_res_numerical_950_lag + 
                       total_pop_log + english_main + gdp_pcap_log + mphone_users_s_lag + 
                       factor(country) + factor(year),
                     data = df))

### export with stargazer 
stargazer(m1_ols, m2_ols, m3_ols, m4_ols, m5_ols, type = "latex",
          omit = c("country", "year"),  
          covariate.labels = c(
            "Anti-Government Protests", 
            "Armed Conflict Internal", 
            "CSO Repression",
            "3 Years Post Change HOS Term Limit",
            "Protest in Region", 
            "Armed Conflict in Region",
            "Restrictive I. Regulation in Region",
            "Population (ln)",
            "English Main Lang.",
            "GDP per capita (ln)",
            "Share of Mobile Phone Users"
          ),
          dep.var.labels = c("Potentially Restrictive Internet Legislation")
)


### 5.2 new measures for IVs 

## Deterring: ACLED + conflict 
## Updating: 1 year post HOS change + electoral autocracy 
## Deterring external: ACLED in Region + conflict in region 
## Learning: Restrictive Reg + BRI + Wagner 
names(df)
## Deterring  
summary(m1_ols <- lm(reg_res ~ acled_events_log_lag + any_conflict_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = df))

## Updating 
summary(m2_ols <- lm(reg_res ~  v2exfxtmhs_1_lag + v2x_regime_bin_ea_lag + 
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = df))

## Deterring Threats abroad 
summary(m3_ols <- lm(reg_res ~ weighted_acled_events_log_950_lag + weighted_any_conflict_950_lag +   
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = df))

## Learning 
summary(m4_ols <- lm(reg_res ~ weighted_reg_res_numerical_950_lag + bri_effect_lag + wagner_count_lag +
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = df))

## All together 
summary(m5_ols <- lm(reg_res ~ acled_events_log_lag + any_conflict_lag +
                       v2exfxtmhs_1_lag + v2x_regime_bin_ea_lag + 
                       weighted_acled_events_log_950_lag +  weighted_any_conflict_950_lag +  
                       weighted_reg_res_numerical_950_lag + bri_effect_lag + wagner_count_lag +
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year),
                     data = df))

stargazer(m1_ols, m2_ols, m3_ols, m4_ols, m5_ols, type = "latex",
          omit = c("country", "year"),  
          covariate.labels = c(
            "Protests (ACLED)", 
            "Armed Conflict Internal", 
            "1 Years Post Change HOS Term Limit",
            "Electoral Autocracy",
            "Protests in Region (ACLED)", 
            "Armed Conflict in Region",
            "Restrictive I. Regulation in Region",
            "3 Years Post Join BRI",
            "Wagner Group Actions in Country",
            "Population (ln)",
            "Share of Internet Users",
            "Youth Bulge",
            "Share of Urban Population"
          ),
          dep.var.labels = c("Potentially Restrictive Internet Legislation")
)


### 5.3 ACLED in Region iterated across distances

# Define the distance thresholds
distance_thresholds <- c(50, 550, 950, 1450, 1950, 2350)

# Initialize a list to store results
results <- list()

# Loop through each distance threshold
for (distance in distance_thresholds) {
  # Define the variable name for the lagged distance measure
  var_name <- paste0("weighted_acled_events_log_", distance, "_lag")
  
  # Fit the OLS model with country and year fixed effects
  model <- lm(
    as.formula(paste0("reg_res ~ ", var_name, "+ weighted_any_conflict_950_lag +  
                       total_pop_log + i_users_lag + youth_bulge + urban_pop + 
                       factor(country) + factor(year)")),
    data = df
  )
  
  # Save the model in the list
  results[[as.character(distance)]] <- model
}

# Extract coefficients and standard errors for each model
coefficients <- sapply(distance_thresholds, function(distance) {
  model <- results[[as.character(distance)]]
  summary_model <- summary(model)
  
  # Find the row for the relevant variable
  coef_index <- grep(paste0("weighted_acled_events_log_", distance, "_lag"), rownames(summary_model$coefficients))
  
  # Extract estimate and standard error
  c(Estimate = summary_model$coefficients[coef_index, "Estimate"],
    Std_Error = summary_model$coefficients[coef_index, "Std. Error"])
})

# Convert to a data frame
coeff_df <- data.frame(
  Distance = distance_thresholds,
  Estimate = coefficients["Estimate", ],
  Std_Error = coefficients["Std_Error", ]
)

# Plot
ggplot(coeff_df, aes(x = Distance, y = Estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = Estimate - 1.96 * Std_Error, ymax = Estimate + 1.96 * Std_Error)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    x = "Distance Threshold (km)",
    y = "Coefficient Estimate"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom",
        text = element_text(size = 18, family = "Times"),  # Set overall text size to 14 and font to Times Roman
        axis.text.x = element_text(size = 16, family = "Times"),  # Make axis numbers bigger and set font
        axis.text.y = element_text(size = 16, family = "Times"),
        legend.text = element_text(size = 16, family = "Times"),
        legend.title = element_blank())


#### STEP 6 Descriptives -------------------------------------------------------------------------------

rm(list = ls())
# reload data for total timespan 
df <- read.csv("243884816.R1_df.csv")

### 6.1 within 950k of Uganda borders in 2010s 
# countries in 950k range Uganda: Burundi, Central African Republic, Democratic Republic of Congo, Ethiopia, Kenya, Malawi, Rwanda, Somalia, South-Sudan, Tanzania, Zambia 
uganda_neighbors <- c("Burundi", "Central African Republic", "Democratic Republic of Congo", "Ethiopia", 
                      "Kenya", "Malawi", "Rwanda", "Somalia", "South Sudan", "Tanzania", "Uganda",  "Zambia")
# filter countries and time period  
df_map <- df %>%
  filter(country %in% uganda_neighbors) %>% 
  mutate(time_period = case_when(
    year >= 2010 & year <= 2022 ~ "2010-2022",
    TRUE ~ "other"  
  )) %>%
  group_by(country, time_period) %>%
  summarise(total_legislation = sum(reg_res_numerical, na.rm = TRUE),
            iso3c = first(iso3c)) %>%
  ungroup()

# Fetching the shapefile for sub-Saharan African countries
world <- ne_countries(scale = "medium", returnclass = "sf")
# create list with iso3c codes from my data 
iso3c <- unique(df$iso3c)

# filter world 
sub_saharan <- world %>%
  filter(iso_a3 %in% iso3c)

# Joining the aggregated data with the sub-Saharan shapefile
# need to match by iso3c 
map_data <- sub_saharan %>%
  left_join(df_map, by = c("iso_a3" = "iso3c")) # Adjust "name" to match the country name column in your dataset

# adjust map_data to get different scale 
table(map_data$total_legislation, useNA = "always")
map_data <- map_data %>%
  mutate(legislation_category = case_when(
    total_legislation == 0 ~ "0",
    total_legislation >= 1 & total_legislation <= 3 ~ "1-3",
    total_legislation >= 4 & total_legislation <= 6 ~ "4-6",
    total_legislation >= 7 ~ "7-10",
    TRUE ~ NA_character_ # for any case that doesn't match above, though there shouldn't be any
  ))

# add shape of southern Africa
southern_africa_context <- world %>%
  filter(subregion %in% c("Eastern Africa", "Middle Africa", "Southern Africa"))  

# Map plotting
ggplot() +
  geom_sf(data = southern_africa_context, fill = "grey90", color = "white", size = 0.25) + # Background context
  geom_sf(data = map_data %>% filter(time_period == "2010-2022"), aes(fill = legislation_category), color = "white") +
  scale_fill_manual(values = c("0" = "grey40", "1-3" = "darksalmon", "4-6" = "brown3", "7-10" = "darkred"),
                    na.value = "lightgrey", guide = "legend") +
  labs(fill = "Total Restrictive Legislation") +
  theme_void() +
  theme(legend.position = "right")


### 6.2 Plot restrictive internet regulations and internet users 

## plotting reg, reg_res, and i_users_per
names(df)
table(df$i_users_per, useNA = "always")

# Summarize data by year
yearly_summary <- df %>%
  group_by(year) %>%
  summarise(
    reg_res = sum(reg_res, na.rm = TRUE),
    i_users_per = mean(i_users_per, na.rm = TRUE))  

# Adjusting colors and linetypes for 'reg_res' and setting up 'i_users_per' for bar plot
category_colors <- c("Restrictive Regulations" = "firebrick", "Internet Users Share" = "lightgrey")
category_linetypes <- c("Restrictive Regulations" = "solid")

# plot 
p1 <- ggplot(yearly_summary, aes(x = year)) +
  geom_col(aes(y = i_users_per, fill = "Internet Users Share"), color = "lightgrey", width = 0.5) +
  geom_line(aes(y = reg_res, color = "Restrictive Regulations", linetype = "Restrictive Regulations")) +
  scale_color_manual(values = category_colors) +
  scale_fill_manual(values = "lightgrey") +
  scale_linetype_manual(values = category_linetypes) +
  labs(x = "", 
       y = "Share (%) & Count", 
       title = "",  # Removed the title
       color = "",  # Removed 'Category' label from legend for color
       linetype = "",  # Removed 'Category' label from legend for linetype
       fill = "") +  # Removed 'Category' label from legend for fill
  theme_minimal() +
  theme(legend.position = "bottom",
        text = element_text(size = 18, family = "Times"),  # Set overall text size to 14 and font to Times Roman
        axis.text.x = element_text(size = 16, family = "Times"),  # Make axis numbers bigger and set font
        axis.text.y = element_text(size = 16, family = "Times"),
        legend.text = element_text(size = 16, family = "Times"),
        legend.title = element_blank())  # Remove legend titles

# Display the adjusted plot
print(p1)


### 6.3 plotting average number of restrictive regulations per regime type
table(df$regime) # need to normalize by number of regime 

# Summarize the data for autocracies and democracies
summary_data <- df %>%
  mutate(regime_type = ifelse(regime %in% c(0, 1), "Autocracy", 
                              ifelse(regime %in% c(2, 3), "Democracy", NA))) %>%
  filter(!is.na(regime_type)) %>%
  group_by(regime_type) %>%
  summarise(total_reg_res = sum(reg_res),
            count = n()) %>%
  mutate(reg_res_per_regime = total_reg_res / count)

# Define colors and linetypes
category_colors <- c("Autocracy" = "grey20", "Democracy" = "lightgrey")
category_linetypes <- c("Autocracy" = "solid", "Democracy" = "solid")

# Create the barplot with specified aesthetics
p2 <- ggplot(summary_data, aes(x = regime_type)) +
  geom_col(aes(y = reg_res_per_regime, fill = regime_type), color = category_colors, width = 0.4) +
  scale_fill_manual(values = category_colors) +
  labs(x = "", 
       y = "Count / Number of Regimes", 
       title = "",  
       fill = "") + 
  theme_minimal() +
  theme(legend.position = "bottom",
        text = element_text(size = 18, family = "Times"), 
        axis.text.x = element_text(size = 16, family = "Times", color = "white"),  
        axis.text.y = element_text(size = 16, family = "Times"),
        legend.text = element_text(size = 16, family = "Times"),
        legend.title = element_blank()) +
  coord_cartesian(xlim = c(0.75, 2.25))

# Print the plot
print(p2)

# export both together for F1 in text 
grid.arrange(p1, p2, ncol = 2, widths = c(1.5, 1))


### 6.4 Restrictive regulations by country 
# 1. Summarize the data
d_plot <- df %>%
  group_by(country) %>%
  summarise(
    total_laws = sum(reg_numerical, na.rm = TRUE),           # Total laws
    restrictive_laws = sum(reg_res_numerical, na.rm = TRUE), # Restrictive laws
    non_restrictive_laws = total_laws - restrictive_laws     # Non-restrictive laws
  ) %>%
  arrange(desc(total_laws), desc(restrictive_laws)) %>%      # Order countries by total laws, then restrictive laws
  mutate(country = factor(country, levels = rev(unique(country)))) # Reverse levels for plotting

# 2. Reshape data for plotting
d_plot_long <- d_plot %>%
  pivot_longer(
    cols = c(restrictive_laws, non_restrictive_laws),
    names_to = "regulation_type",
    values_to = "count"
  ) %>%
  mutate(
    regulation_type = factor(
      regulation_type,
      levels = c("non_restrictive_laws", "restrictive_laws") # Ensure stacking order: Restrictive first
    )
  )

# 3. Plot the data
ggplot(d_plot_long, aes(x = country, y = count, fill = regulation_type)) +
  geom_bar(stat = "identity", position = "stack", color = "white", size = 0.65) + # Add a border for clarity
  scale_fill_manual(
    values = c(
      "restrictive_laws" = "black",
      "non_restrictive_laws" = "grey90"
    ),
    labels = c("Not Restrictive", "Potentially Restrictive") # Rename legend labels
  ) +
  labs(
    x = "Country",
    y = "Number of Regulations",
    fill = "Internet Regulations:"
  ) +
  theme_minimal() +
  theme(
    text = element_text(family = "Times", size = 14),
    axis.title = element_text(family = "Times", size = 16, face = "bold"),
    axis.text = element_text(family = "Times", size = 14),
    legend.title = element_text(family = "Times", size = 16, face = "bold"),
    legend.text = element_text(family = "Times", size = 16),
    axis.text.y = element_text(family = "Times", size = 14),
    legend.spacing.y = unit(0.3, "cm"),
    legend.key = element_rect(colour = "white", size = 1.25),
    legend.key.height = unit(0.5, "cm"),
    legend.key.width = unit(0.75, "cm")
  ) +
  coord_flip()

# Save the plot
ggsave("regulations_by_country_final.pdf", plot = last_plot(), width = 10, height = 8)

